import pandas as pd
import numpy as np

# Load the list of pages under the 'materials' category tree.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
# 2.0 (replaced by on_bad_lines='skip') — update if running on modern pandas.
df = pd.read_csv('data/materials-pages', error_bad_lines = False)
df.head()

# Dict used as a set: map each title (underscores -> spaces) to a dummy 0
# so membership tests against the dump's titles are O(1).
pageDict = {" ".join(page.split('_')): 0 for page in df['title']}
pageDict
# Scan the processed Wikipedia dump and keep only the articles whose title
# appears in pageDict.
# BUG FIX: the original opened the output in text mode and wrote the raw
# bytes yielded by BZ2File — fine in Python 2.7, TypeError in Python 3.
# Opening the output in binary mode fixes it.
import os
import bz2
import json
import time

t0 = time.time()
writefile = "data/wiki_proc_filt.json"
written = 0
with open(writefile, 'wb') as fwrite:  # binary: BZ2File yields bytes in py3
    for folder in os.listdir('data/wiki_processed'):
        for filename in os.listdir(os.path.join('data/wiki_processed', folder)):
            filepath = os.path.join('data/wiki_processed', folder, filename)
            with bz2.BZ2File(filepath, 'r') as fread:
                for line in fread:
                    d = json.loads(line)  # json.loads accepts bytes on 3.6+
                    if d['title'] in pageDict:
                        written += 1
                        fwrite.write(line)
print("Searched for " + str(len(pageDict)) + " titles.")
print(str(written) + " titles found and written to " + writefile)
t = time.time()
print("time taken = " + str(t-t0) + "s.")
Our list of articles in the category 'materials' with depth = 5 had 197530 articles. Searching by title, only 164600 of those were found in the processed wikipedia dump. This might be because the catscan is more recent than the dump.
Let's try to search instead by "id" rather than "title".
# Same filtering pass, but matching on page id instead of title.
# BUG FIX: as above, the output must be opened in binary mode because
# BZ2File yields bytes in Python 3 (the "works in 2.7 but not 3.6" issue).
import time

t0 = time.time()
# Make the if statement true if you want to run this. May take a few minutes.
if 1 == 1:
    # Dict used as a set of page ids for O(1) membership tests.
    idDict = {pageid: 0 for pageid in df["pageid"]}
    writefile = "data/wiki_proc_filt_id.json"
    writtenId = 0
    with open(writefile, 'wb') as fwrite:  # binary: raw byte lines pass through
        for folder in os.listdir('data/wiki_processed'):
            for filename in os.listdir(os.path.join('data/wiki_processed', folder)):
                filepath = os.path.join('data/wiki_processed', folder, filename)
                with bz2.BZ2File(filepath, 'r') as fread:
                    for line in fread:
                        d = json.loads(line)
                        if int(d['id']) in idDict:
                            writtenId += 1
                            fwrite.write(line)
    # Message fix: this pass searches ids, not titles.
    print("Searched for " + str(len(idDict)) + " ids.")
    print(str(writtenId) + " ids found and written to " + writefile)
t = time.time()
print("Time taken: " + str(t-t0) + "s")
Even after searching by id, about 27000 articles are missing. We will go ahead and process the articles we could find.
There is one complication in the dump I have processed. We need the dump with the links, but the one I have is without links. The idea behind preserving the links is that we should convert phrases within a link into a single entity - an n-gram. So below I obtained a different pre-processed dump.
This file is tab separated. Each section is written as a new entry. Also contains references.
import re

# Matches anchors of the form <a href"TARGET">TEXT</a> — the quirky quoting
# is exactly what this pre-processed dump emits.  Compiled once at module
# level instead of on every call.
_LINK_RE = re.compile(r'(<a href\")(.*?)(\">)(.*?)(</a>)')

def join_links(example):
    """Replace every <a href"...">...</a> anchor in *example* with its link
    target, spaces converted to underscores, so a linked phrase becomes a
    single n-gram token.

    Returns the transformed string; text without anchors passes through
    unchanged.
    """
    for match in _LINK_RE.findall(example):
        whole = ''.join(match)  # reassemble the full matched anchor text
        example = example.replace(whole, match[1].replace(" ", "_"))
    return example
# Rewrite the raw dump with link phrases fused into single tokens.
# FIX: the original never closed the input file; both files are now managed
# by a with-statement.
t0 = time.time()
with open("data//wiki_materials_rows_out.txt", 'r') as readfile:
    with open("data/link_joined.txt", 'w') as tempfile:
        for line in readfile:
            tempfile.write(join_links(line))
t = time.time()
print("Time taken = " + str(t-t0) + "s.")
Let's take a look at how the file looks now
# Peek at the transformed file: stash an early line in `example`, print a
# few more, then stop.  FIX: file handle managed by with; enumerate instead
# of a manual counter.
with open("data//link_joined.txt", 'r') as readfile:
    for i, line in enumerate(readfile):
        if i < 2:
            example = line
        elif i < 5:
            print(line)
        else:
            break
For ease of later processing, convert the tab-separated .txt file into a JSON-lines file.
# Convert the tab-separated dump into JSON-lines, keeping id, title,
# section and text (fields 3 and 4 of a row are discarded).
t0 = time.time()
lines_read = 0
lines_write = 0
with open("data/link_joined.txt", 'r') as readfile:
    with open("data/link_joined.json", 'w') as writefile:
        for line in readfile:
            lines_read += 1
            data = line.split('\t')
            # A well-formed row has exactly 6 tab-separated fields.
            if len(data) == 6:
                lines_write += 1
                record = {"id": data[0], "title": data[1],
                          "section": data[2], "text": data[5]}
                writefile.write(json.dumps(record) + '\n')
print(str(lines_read) + " lines read.")
print(str(lines_write) + " lines written. ")
t = time.time()
print("time taken = " + str(t-t0) + "s.")
Make sure we still have all the articles we started with ~166000
# Count distinct article titles in the joined file.
# FIX: a set is the natural structure here (the original used a dict with
# dummy values and a dead manual counter).
title_dict = set()
with open("data/link_joined.json") as f_r:
    for line in f_r:
        d = json.loads(line)
        title_dict.add(d['title'])
print(len(title_dict))
Fuse the text for the same title together. Assume that text from the same article is consecutive: once an article ends, it never restarts again.
# Fuse the per-section records of each article into one record per article.
# Assumes all records of an article are consecutive and an article never
# restarts once it has ended.
#
# BUG FIXES vs. original:
#  * n_line started at 1, so the first-record branch never ran and id_prev
#    was referenced before assignment; it must start at 0.
#  * the final article was never written out after the loop.
#  * section_prev was never updated within an article, so a repeated section
#    name could be appended more than once.
t0 = time.time()
titles = 0
n_line = 0
id_prev = None
with open("data/link_joined.json", 'r') as readfile:
    with open("data/titles_joined.json", 'w') as writefile:
        for line in readfile:
            d = json.loads(line)
            n_line += 1
            if n_line == 1:
                # First record: start accumulating the first article.
                id_prev = d["id"]
                title_prev = d["title"]
                text_prev = d["text"]
                section_prev = d["section"]
                sections = d["section"]
            elif d["id"] == id_prev:
                # Same article: append text; record the section name if new.
                text_prev = text_prev + "\n" + d["text"]
                if d["section"] != section_prev:
                    sections = sections + "," + d["section"]
                section_prev = d["section"]
            else:
                # New article: flush the accumulated one and reset state.
                titles += 1
                writeline = {"id": id_prev, "title": title_prev,
                             "section": sections, "text": text_prev}
                writefile.write(json.dumps(writeline) + '\n')
                id_prev = d["id"]
                title_prev = d["title"]
                text_prev = d["text"]
                section_prev = d["section"]
                sections = d["section"]
        # Flush the last article (the original dropped it).
        if n_line > 0:
            titles += 1
            writeline = {"id": id_prev, "title": title_prev,
                         "section": sections, "text": text_prev}
            writefile.write(json.dumps(writeline) + '\n')
print(str(titles) + " written.")
t = time.time()
print("time taken = " + str(t-t0) + "s.")
Make a list of titles
# Dict-as-set of article titles, spaces converted to underscores so they
# match dbpedia resource names used below.
titles = {}
with open('data/titles_joined.json', 'r') as readfile:
    for line in readfile:
        d = json.loads(line)
        titles[d['title'].replace(" ", "_")] = 0
len(titles)
# BUG FIX: dict.keys() is not subscriptable in Python 3; materialise a list.
list(titles.keys())[5]
WikiExtractor does not import the category information for each article by default. However dbpedia stores a lot of structured information about each wikipedia article such as category names, templates, info-boxes etc.
Let's get the categories info from dbpedia.
from collections import defaultdict
import warnings

# Parse dbpedia's article_categories .ttl dump and collect the list of
# category names for every title in our set.
with warnings.catch_warnings():
    warnings.simplefilter('error')  # escalate any warnings so they surface
    t0 = time.time()
    titles_cat = defaultdict(list)
    with open('data/article_categories_en.ttl', 'r') as readfile:
        n = 0
        for line in readfile:
            # Skip the header line and any line without a resource URI.
            if n == 0 or len(line.split(">")) < 2:
                n += 1
                continue
            n = n + 1
            # First resource URI on the line is the article title.
            t_db = line.split("<http://dbpedia.org/resource/")[1].split(">")[0]
            # BUG FIX: the original called t_db.decode('utf-8'), a Python-2
            # leftover; in Python 3 the line is already str.
            if t_db in titles:
                # Second resource URI carries the Category:Name; keep Name.
                cat = line.split("<http://dbpedia.org/resource/")[2].split(":")[1].split(">")[0]
                titles_cat[t_db].append(cat)
    t = time.time()
print("Extracted categories for " + str(len(titles_cat.keys())) + " titles in " + str(t-t0) + "s.")
# Titles for which dbpedia returned no category (comprehension instead of
# the original loop-with-continue).
notfound = [key for key in titles if key not in titles_cat]
for x in notfound:
    print(x)

# Persist the title -> categories mapping.
with open('data/titles_cat.json', 'w') as writefile:
    json.dump(titles_cat, writefile)
# Attach the dbpedia category list to every article that has one; articles
# with no category entry are skipped entirely.
t0 = time.time()
titles = 0
with open('data/wiki_final_processed.json', 'w') as writefile:
    with open ('data/titles_joined.json', 'r') as readfile:
        for line in readfile:
            record = json.loads(line)
            # Underscored form matches the dbpedia resource naming.
            record['title'] = record['title'].replace(" ", "_")
            if record['title'] in titles_cat:
                titles += 1
                record['category'] = titles_cat[record['title']]
                writefile.write(json.dumps(record) + '\n')
t = time.time()
print("Wrote categories for " + str(titles) + " articles in " + str(t-t0) + "s.")
We have a list of articles under each category. Let's also get a list of categories for each article, by storing in a data structure called default dict.
import json
from collections import defaultdict

# Invert the mapping: for each category, the list of article titles in it.
# FIX: removed the unused `ids` and `categories` lists from the original.
n = 0
cat_titles = defaultdict(list)
with open('data/wiki_final_processed.json', 'r') as readfile:
    for line in readfile:
        d = json.loads(line)
        n += 1
        for category in d['category']:
            cat_titles[category].append(d['title'])
We will store this in a pandas dataframe for easy filtering and querying.
# Category -> article-list table for easy filtering and querying.
# FIX: the original cell also referenced the undefined names `cat_items`
# and `stemmer2` (leftovers from other experiments) which raised NameError;
# those lines have been dropped.
cats_df = pd.DataFrame(list(cat_titles.items()), columns=['category', 'articles'])
cats_df.head()
Sort the dataframe based on the number of articles in each category, so that we can focus on categories with the most articles.
# Article count per category, then sort descending so the biggest
# categories come first.
# FIX: vectorized apply(len) on the named column instead of the original
# positional iloc[i][1] loop over every row.
cats_df['n_articles'] = cats_df['articles'].apply(len)
cats_df.sort_values('n_articles', axis=0, ascending=False, inplace=True, na_position='last')
cats_df.to_csv('data\cats_articles', index=False, header=True)
#cats_df = pd.read_csv('data\cats_articles')
cats_df.head()
# Save the 300 largest categories (category name + article count only).
cats_df.iloc[:300, [0,2]].to_csv('data\cats_articles300', index=False)
import spacy
import time
import json

# NOTE(review): the 'en' shortcut was removed in spaCy 3 — confirm the
# installed spaCy version supports it (or use 'en_core_web_sm').
nlp = spacy.load('en')
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Tokenize every article's text with the rule-based tokenizer and store it
# back as a single space-joined string.
t0 = time.time()
count = 0
with open('data/wiki_final_processed.json', 'r') as readfile:
    with open('data/wiki_tokenized.json', 'w') as writefile:
        for line in readfile:
            record = json.loads(line)
            toks = [str(token) for token in tokenizer(record['text'])]
            record['text'] = " ".join(toks)
            writefile.write(json.dumps(record) + '\n')
            count += 1
            # Progress ping every 10k articles.
            if count % 10000 == 0:
                print(record['text'][:30], toks[:4])
                print("Tokenized", count, "articles")
t = time.time()
print(t - t0)
import nltk

# Fetch the stop-word corpus required by ignore_stopwords below.
nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer that leaves stop words unstemmed.
stemmer = SnowballStemmer("english", ignore_stopwords=True)
stemmer.stem('running')
From among the 300 most frequent categories, I have picked 5 categories to train a classifier on. For a single label classifier, let's make sure that the categories don't share a significant number of articles.
# Five hand-picked categories for a single-label classifier.  Print how
# many articles every pair shares, to check the labels are mostly disjoint.
choose = ['Fluid_dynamics', 'Quantum_mechanics', 'Condensed_matter_physics', 'Limestone_caves', 'Acoustics']

# PERF FIX: the original rebuilt both article sets inside the inner loop;
# build each category's set once up front.
cat_arts = {cat: set(cats_df[cats_df['category'] == cat]['articles'].tolist()[0])
            for cat in choose}
for i, cat_1 in enumerate(choose):
    for j, cat_2 in enumerate(choose):
        if i <= j:  # each unordered pair once, no self-pairs
            continue
        print(cat_1, cat_2, len(list(cat_arts[cat_1].intersection(cat_arts[cat_2]))))
Some category pairs have articles in common, but fewer than 10% of their articles overlap.
Let's write all the tokenized text for this selected set of articles to a file.
# Map each selected article title to the list of chosen categories that
# contain it (an article may belong to several).
arts = defaultdict(list)
for cat in choose:
    for art in cats_df[cats_df['category']==cat]['articles'].tolist()[0]:
        arts[art].append(cat)

# Spot-check: articles of the first chosen category.
cats_df[cats_df['category']==choose[0]]['articles'].tolist()[0]
import json

# Pull the tokenized text of the selected articles into one training file,
# tagging each record with its chosen category list.
written = 0
with open('data\wiki_tokenized.json', 'r') as readfile:
    with open('data\physchemx.json', 'w') as writefile:
        for line in readfile:
            record = json.loads(line)
            if record['title'] not in arts:
                continue
            written += 1
            record['category'] = arts[record['title']]
            writefile.write(json.dumps(record) + '\n')
print("written", written, "articles")
import pandas as pd

# Load the selected articles into a DataFrame for the classifier.
data = pd.read_json('data\physchemx.json', lines=True)
data.head()
titles = data['title']
X_raw = data['text']
y_raw = data['category']
# Keep only the 1st category if there are multiple categories for a title
# (forces a single label; introduces some noise for overlapping categories).
# FIX: the original comprehension's loop variable was also named `y`,
# shadowing the result it was building.
y = [cats[0] for cats in y_raw]
The function text_process removes punctuation and stop-words. This will be passed to the count-vectorizer from sklearn.
import string
from spacy.lang.en.stop_words import STOP_WORDS

# Dicts used purely as O(1) membership tables for `in` tests below.
punc_dict = dict((s, 1) for s in string.punctuation)
stop_dict = dict((c, 1) for c in STOP_WORDS)
stop_dict['\n\n'] = 1
stop_dict['\n'] = 1
# Keep underscores: linked phrases were fused into foo_bar n-gram tokens.
punc_dict.pop('_', None)

# Make all lowercase, remove punctuation and stop words.
def text_process(text, punc_dict=punc_dict, stop_dict=stop_dict):
    """Lowercase *text*, drop punctuation (except '_') and stop words.

    Returns the surviving words as a list of tokens (split on single
    spaces).  Passed to sklearn's CountVectorizer as the analyzer.
    """
    text = text.lower()
    # PERF FIX: the original appended to a string char-by-char (quadratic);
    # a generator fed to join builds the filtered string in one pass.
    nopunc = "".join(char for char in text if char not in punc_dict)
    return [word for word in nopunc.split(" ") if word not in stop_dict]
Let's see what it does. It creates a list of words - tokens for each text that is passed to it.
# Demo: tokenize one article's text into a lowercase, stop-word-free word list.
text_process(X_raw[1])
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed for reproducibility.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, test_size=0.3, random_state=101)

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts using our custom analyzer.
# NOTE(review): the vocabulary is fit on the full corpus (train + test),
# which leaks test-set vocabulary into the features — confirm acceptable.
vectorizer = CountVectorizer(analyzer=text_process)
vector_fit = vectorizer.fit(X_raw)
X = vector_fit.transform(X_raw)
X_train = vector_fit.transform(X_train_raw)
X_test = vector_fit.transform(X_test_raw)

# Percentage of non-zero entries in the document-term matrix.
density = X.nnz / X.shape[0] / X.shape[1] * 100
print(density)
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the bag-of-words counts.
model = MultinomialNB()
fit = model.fit(X_train, y_train)
predict = model.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report

# Evaluate on the held-out split.
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))
A simple Naive Bayes classifier is able to do pretty well at identifying the correct category for each article. The precision is very high for classes with no overlaps, while quantum mechanics and condensed matter physics have relatively lower precision. This is in fact related to the noisiness of the data itself. Condensed_matter_physics and Quantum_mechanics share 17 articles in common, but we forced only one of those categories onto each of those articles — this introduced noise into the training data itself.
Let's look at the list of top 100 categories with the most articles. Recall that this list of articles was obtained by traversing the wikipedia graph starting at the categories 'materials' and traversing up to depth 5, with the motive of getting a broad range of articles about materials science and related topics such as chemistry, engineering and life-science.
However, looking at the top 100 categories,
Thus the problem statement is to do an unsupervised clustering of these categories.
import pandas as pd

# Reload the per-category article table saved earlier.
cats_df = pd.read_csv('data\cats_articles')
cats_df.head()
# The 100 categories with the most articles.
cats_df[:100]
Choice of features:
Choice of clustering algorithms
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import LineSentence
import pyLDAvis
import pyLDAvis.gensim
import warnings
import os
import numpy as np
import _pickle as pickle
import json
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_numeric
from spacy.lang.en.stop_words import STOP_WORDS
# Stop-word lookup table (dict used as a set), with explicit newline
# entries since the article text keeps raw linebreaks.
stop_dict = {word: 1 for word in STOP_WORDS}
stop_dict['\n\n'] = 1
stop_dict['\n'] = 1
import time

# One streaming pass over the tokenized dump: strip punctuation, numbers
# and stop words, write the cleaned text to wiki_nostem.json, and grow a
# gensim Dictionary over the same stream, pruning it periodically so it
# stays bounded.
t0 = time.time()
dct = Dictionary([])  # empty dictionary, filled incrementally below
with open('data/wiki_tokenized.json', 'r') as readfile:
    with open('data/wiki_nostem.json', 'w') as writefile:
        for i, line in enumerate(readfile):
            d = json.loads(line)
            text = strip_punctuation(strip_numeric(d['text']))
            text = " ".join([word for word in text.split(" ") if word not in stop_dict])
            d['text'] = text
            writefile.write(json.dumps(d) + '\n')
            # FIX: the original wrapped the tokens in a (1, n) numpy array;
            # add_documents just needs a list containing one token list.
            dct.add_documents([text.split(" ")])
            if i % 10000 == 0 and i != 0:
                print(i, " records processed. Length of dict = ", len(dct))
                # Periodic prune keeps memory bounded during the pass.
                dct.filter_extremes(keep_n=800000)
                dct.compactify()
                print("filtered to length ", len(dct))
# Final prune covering everything added since the last periodic one.
dct.filter_extremes(keep_n=800000)
dct.compactify()
t = time.time()
print(t - t0)
# Persist the dictionary, then reload it from disk (doubles as a sanity
# check and a restart point for later sessions).
gensim_dict_filepath = os.path.join("data", 'gensim_dict_nostem.dict')
dct.save(gensim_dict_filepath)

dct = Dictionary.load(gensim_dict_filepath)
The LDA implementation below is based on a tutorial by Patrick Harrison - https://youtu.be/6zm9NC9uRkk, accompanied by a Github notebook - https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb
bow_filepath = os.path.join('data', 'bow_corpus.mm')

def bow_generator(filepath):
    """Generator: read one JSON article per line from *filepath* and yield
    its bag-of-words representation via the module-level gensim `dct`.

    (Docstring fix: the original said "reviews", a leftover from the
    tutorial this was adapted from; also dropped a pointless `if 0==0:`.)
    """
    with open(filepath, 'r') as readfile:
        for line in readfile:
            d = json.loads(line)
            yield dct.doc2bow(d['text'].split())
# %%time  (IPython cell magic in the original notebook; not valid plain Python)
if 0 == 1:  # flip to True to regenerate the serialized corpus (slow)
    # generate bag-of-words representations for
    # all text and save them as a matrix
    MmCorpus.serialize(bow_filepath, bow_generator('data/wiki_nostem.json'))

# load the finished bag-of-words corpus from disk
bow_corpus = MmCorpus(bow_filepath)
# %%time  (IPython cell magic in the original notebook; not valid plain Python)
lda_filepath = os.path.join('data', 'lda')
if 1 == 0:  # flip to True to retrain the LDA model (slow)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(bow_corpus,
                           num_topics=100,
                           id2word=dct,
                           workers=4)
        lda.save(lda_filepath)

# load the finished LDA model from disk
lda = LdaMulticore.load(lda_filepath)
def explore_topic(topic_number, topn=25):
    """Print a formatted list of the top *topn* terms (with weights) for
    the user-supplied *topic_number* of the module-level `lda` model.
    """
    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
    # BUG FIX: the original hard-coded topn=25 in this call, silently
    # ignoring the function's topn parameter.
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 5)))

explore_topic(topic_number=28)
# %%time  (IPython cell magic in the original notebook; not valid plain Python)
LDAvis_data_filepath = os.path.join('data', 'ldavis_prepared')
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, bow_corpus, dct)

# Render the interactive topic-model visualization.
pyLDAvis.display(LDAvis_prepared)
Looking at the visualization above, topic modeling seems like a very promising approach for separating out the articles. Specifically, in the visualization,
topics on the left are more related to social science, politics, and literature, while topics on the right are primarily about more science-oriented articles.
There are a large number of small topics in the center; these are harder to assign to one of the two broad categories — science or social science.